/*==============================================================================
FILE NAME: Convert_xlsx_to_dta.do
PURPOSE: To convert raw data files into .dta format
INPUTS: raw data files: PIR56575_item_1_2009&before_02012021.xlsx, PIR56575_item_1_part2_2010 and after.xlsx, PIR-56575-Report-4.xlsx, PIR-56575_item_5-part2.xlsx, PIR56575_item_7.xlsx, PIR 21-56575 (NSR Permits).xlsx, 
PIR 21-56575 (Title V Permits).xlsx, PIR56575_item_3.xlsx, PIR-56575-Report-4.xlsx,
PIR-56575_item_5-part1.xlsx, PIR-56575_item_5-part2.xlsx, PIR56575_item_7.xlsx, PIR 21-56575 (NSR Permits).xlsx, PIR 21-56575 (Title V Permits).xlsx
OUTPUTS: raw data files: PIR56575_item_1_2009_and_before.dta, PIR56575_item_1_part2_2010_and_after.dta, PIR56575_item_2_part1.dta, PIR56575_item_2_part2.dta, PIR56575_item_4_part1.dta, PIR56575_item_4_part2.dta,
PIR56575_item_4_part4.dta, PIR56575_item_5_part1.dta, PIR56575_item_5_part2.dta
processed data files: incidents.dta, investigations.dta, enforcement.dta, facility_characteristics.dta, Notices_of_violation.dta, Emissions_events.dta, NSR_Permits.dta, TitleV_Permits.dta
CREATED: 26 September 2023
UPDATED: 23 July 2025
==============================================================================*/

/* Set directory if working independently through code */
if c(username)=="" { //insert username
    global rootdir "" // insert root path
    global processed_data "$rootdir/processed_data"  // Define global paths for replication package
} 

// Allow import of large Excel files
set excelxlsxlargefile on

*** #1 - Incidents ***

// Import and convert incident data (2009 & before)
import excel "$raw_data/PIR56575_item_1_2009&before_02012021.xlsx", sheet("Sheet 1") firstrow clear
save "$raw_data/PIR56575_item_1_2009_and_before.dta", replace

// Import and convert incident data (2010 & after)
import excel "$raw_data/PIR56575_item_1_part2_2010 and after.xlsx", sheet("Sheet 1") firstrow clear
save "$raw_data/PIR56575_item_1_part2_2010_and_after.dta", replace

// Combine both incident datasets and save processed version
use "$raw_data/PIR56575_item_1_2009_and_before.dta", clear
append using "$raw_data/PIR56575_item_1_part2_2010_and_after.dta"
save "$processed_data/incidents.dta", replace

*** #2 - Investigations ***

// Import and convert investigations data from two sheets
import excel "$raw_data/PIR-56575_item_2.xlsx", sheet("Sheet 1") firstrow clear
save "$raw_data/PIR56575_item_2_part1.dta", replace

import excel "$raw_data/PIR-56575_item_2.xlsx", sheet("Sheet 2") firstrow clear
save "$raw_data/PIR56575_item_2_part2.dta", replace

// Combine both investigations datasets and save processed version
use "$raw_data/PIR56575_item_2_part1.dta", clear
tostring InvestigationStatusDate, replace
append using "$raw_data/PIR56575_item_2_part2.dta"
save "$processed_data/investigations.dta", replace

*** #3 - Enforcement ***

// Import and convert enforcement data
import excel "$raw_data/PIR56575_item_3.xlsx", sheet("Sheet 1") firstrow clear
save "$processed_data/enforcement.dta", replace

*** #4 - Facility Characteristics ***

// Import and convert facility characteristics data from three sheets
import excel "$raw_data/PIR-56575-Report-4.xlsx", sheet("Sheet 1") firstrow clear
save "$raw_data/PIR56575_item_4_part1.dta", replace

import excel "$raw_data/PIR-56575-Report-4.xlsx", sheet("Sheet 2") firstrow clear
save "$raw_data/PIR56575_item_4_part2.dta", replace

import excel "$raw_data/PIR-56575-Report-4.xlsx", sheet("Sheet 3") firstrow clear
save "$raw_data/PIR56575_item_4_part3.dta", replace

// Combine all facility characteristics datasets and save processed version
use "$raw_data/PIR56575_item_4_part1.dta", clear
append using "$raw_data/PIR56575_item_4_part2.dta"
append using "$raw_data/PIR56575_item_4_part3.dta"
save "$processed_data/facility_characteristics.dta", replace

*** #5 - Notices of Violation ***

// Import and convert notices of violation data from two files
import excel "$raw_data/PIR-56575_item_5-part1.xlsx", sheet("Sheet 1") firstrow clear
save "$raw_data/PIR56575_item_5_part1.dta", replace

import excel "$raw_data/PIR-56575_item_5-part2.xlsx", sheet("Sheet 1") firstrow clear
save "$raw_data/PIR56575_item_5_part2.dta", replace

// Combine both notices of violation datasets and save processed version
use "$raw_data/PIR56575_item_5_part1.dta", clear
append using "$raw_data/PIR56575_item_5_part2.dta"
save "$processed_data/Notices_of_violation.dta", replace

*** #7 Emissions Events ***

// Import and convert emissions events data
import excel "$raw_data/PIR56575_item_7.xlsx", sheet("Sheet 1") firstrow clear
save "$processed_data/Emissions_events.dta", replace

** PERMITS **

// NSR Permits

// Import and convert NSR permits data
import excel "$raw_data/PIR 21-56575 (NSR Permits).xlsx", sheet("Sheet1") firstrow clear
save "$processed_data/NSR_Permits.dta", replace

// Title V Permits 

// Import and convert Title V permits data
import excel "$raw_data/PIR 21-56575 (Title V Permits).xlsx", sheet("Sheet1") firstrow clear
save "$processed_data/TitleV_Permits.dta", replace